import pandas as pd
# Read the AutoScout24 Germany listings CSV into a DataFrame, then print its
# column summary (dtypes, non-null counts, memory usage).
data_raw = pd.read_csv(
    filepath_or_buffer='data/autoscout24-germany-dataset.csv',
)
data_raw.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46405 entries, 0 to 46404
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 mileage 46405 non-null int64
1 make 46405 non-null object
2 model 46262 non-null object
3 fuel 46405 non-null object
4 gear 46223 non-null object
5 offerType 46405 non-null object
6 price 46405 non-null int64
7 hp 46376 non-null float64
8 year 46405 non-null int64
dtypes: float64(1), int64(3), object(5)
memory usage: 3.2+ MB
# Keep only the rows that have no missing value in any column, stored as an
# independent copy of the raw table, then print the summary of the result.
data = data_raw.dropna(axis=0, how='any').copy()
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 46071 entries, 0 to 46404
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 mileage 46071 non-null int64
1 make 46071 non-null object
2 model 46071 non-null object
3 fuel 46071 non-null object
4 gear 46071 non-null object
5 offerType 46071 non-null object
6 price 46071 non-null int64
7 hp 46071 non-null float64
8 year 46071 non-null int64
dtypes: float64(1), int64(3), object(5)
memory usage: 3.5+ MB
# Extract the 'mileage' and 'price' columns of the cleaned table as NumPy
# arrays. to_numpy() is the accessor pandas recommends over the older
# .values attribute.
x_km = data['mileage'].to_numpy()
y = data['price'].to_numpy()

# Static scatter plot of price against mileage.
# pyplot is imported rather than pylab: matplotlib documents pylab as a
# discouraged historic interface, and every plt.* call below is a pyplot name.
import matplotlib.pyplot as plt

plt.style.use('bmh')
fig, ax = plt.subplots()
ax.scatter(x_km, y)
ax.set_xlabel('Kilometerstand')
ax.set_ylabel('Preis')
# NOTE: the trailing semicolon is presumably a notebook idiom to keep the
# returned Text object from being echoed as cell output.
ax.set_title('Gebrauchtwagenmarkt 2011-2021 (Autoscout24)');
import plotly.express as px
# Price-vs-mileage scatter drawn with Plotly Express.
# x_km and y are re-extracted from `data` here, but the figure below is built
# from the DataFrame columns by name and does not take these arrays.
y = data['price'].values
x_km = data['mileage'].values

fig = px.scatter(
    data,
    x='mileage',
    y='price',
    title='Gebrauchtwagenmarkt 2011-2021 (Autoscout24)',
)
# Set explicit axis titles on the figure before displaying it.
fig.update_layout(xaxis_title='Kilometerstand [km]', yaxis_title='Preis (Euro)')
fig.show()